## multiple imputation and additional data prep in preparation for analysis 
## some data prep on the LPP is not included in the replication archive because the LPP is not yet a publicly available dataset

## Note that MI runs on the data with the PES leader and party evaluations, so these evaluations could influence our imputed pre-election evaluations. But we don't incorporate post-election evaluations into the mclogit; they are just for robustness.

# Load the preprocessed survey objects into the workspace.
# NOTE(review): `DD`, the data frame imputed and analysed below, is not created
# anywhere in this script, so it is presumably restored by this call -- confirm.
load("2015_LPP_preprocessed.RData")

# Count how many entries of `vec` are not NA.
number.not.missing <- function(vec) {
  is.present <- !is.na(vec)
  sum(is.present)
}

# Number of imputed datasets to request from Amelia.
m <- 5

# Multiply impute DD.
#   idvars: columns passed to Amelia as identification variables
#   noms:   columns declared nominal
#   ords:   columns declared ordinal
# NOTE(review): no set.seed() call is visible before this point, so the
# imputations may differ from run to run -- confirm whether that is intended.
out <- Amelia::amelia(
  DD,
  m = m,
  idvars = c(
    "id", "constituencynumber",
    "vote_party_pre", "vote_party_post", "vote_party_both",
    "region", "party_id",
    "age_cat", "income_cat", "edu_cat",
    "correctly_predicted_result", "attention_1"
  ),
  noms = c("gender", "did.not.vote.or.dont.know.2011"),
  ords = c("education", "income")
)


# let's use mclogit to get the appropriate weight coefficients on the different thermometer scores. 

# see utils/mclogit checking for checking how this works with fake data. 

library(mclogit)

# data needs to be restructured for this

# Fit one conditional logit per imputed dataset, regressing the chosen party on
# the three pre-election thermometer scores (party, leader, candidate).
# The result, mclogit.outs, holds one fitted model per imputation.

# Party codes and thermometer stems do not change across imputations, so they
# are defined once here; later sections of the script reuse `parties`.
parties <- c("lib", "con", "ndp", "grn", "bq")
vars <- paste0(c("cps_party", "cps_leader", "cps_cand"), "_therm")

mclogit.outs <- vector("list", length(out$imputations))
for (k in seq_along(out$imputations)) {
  tD <- out$imputations[[k]]

  # Reshape wide -> long: one row per respondent x party. Transposing before
  # flattening makes the thermometer values run party-within-respondent, which
  # matches the id (each = ...) and party (recycled) columns.
  n.parties <- length(parties)
  party.long <- rep(parties, nrow(tD))
  vote.long <- rep(tD$vote_party_both, each = n.parties)
  mlD <- data.frame(
    id = rep(tD$id, each = n.parties),
    party = party.long,
    # 1 on the row of the party the respondent voted for, 0 elsewhere
    chosen = as.integer(as.character(vote.long) == party.long),
    cps_party_therm = as.vector(t(as.matrix(tD[, paste0(vars[1], "_", parties)]))),
    cps_leader_therm = as.vector(t(as.matrix(tD[, paste0(vars[2], "_", parties)]))),
    cps_cand_therm = as.vector(t(as.matrix(tD[, paste0(vars[3], "_", parties)]))),
    vote_party_post = vote.long,
    region = rep(tD$region, each = n.parties)
  )

  # Drop the bq alternative for respondents outside Quebec, so it is not part
  # of their choice set (presumably because the Bloc only runs there).
  mlD2 <- mlD[!(mlD$party == "bq" & mlD$region != "Quebec"), ]

  # cbind(chosen, id): choice indicator plus the id that groups rows into one
  # respondent's choice set.
  mclogit.outs[[k]] <- mclogit(
    cbind(chosen, id) ~ cps_party_therm + cps_leader_therm + cps_cand_therm,
    data = mlD2
  )
}

# let's look at the outputs 
# Rescale a fitted model's coefficients so that they sum to one.
weight.vec.from.mclogit.out <- function(mclo) {
  betas <- coef(mclo)
  betas / sum(betas)
}
# One row per imputation, one column per thermometer, in the order the terms
# appear in the mclogit formula (party, leader, candidate).
# TRUE is spelled out because T is an ordinary, reassignable variable.
weight.mat <- matrix(
  unlist(lapply(mclogit.outs, weight.vec.from.mclogit.out)),
  ncol = 3,
  byrow = TRUE
)

# Average each thermometer's weight across the imputations.
weights <- apply(weight.mat, 2, mean)

## so now what do we do:
## approaches we want to take: 
  ## use the weighted preferences in the MIs to do the combined analysis
  ## just the party preferences from the MIs
  ## just the party preferences where we have enough info


## so the input for this general procedure is a list -- in each element of the list we have the id and vote info and a utility matrix (which has been boiled down in some way) plus maybe some other tie-breaking variables. 

# okay make the MI datasets that combine the utilities using the weights above
# Container for every dataset variant handed to the analysis. Each element is
# a list of data frames filled in further down the script.
# "pes.leader.utilities" is declared here as well, because the loop below
# fills it alongside the others.
lists.for.analysis <- list(
  "weighted.utilities" = list(),
  "party.utilities" = list(),
  "raw.party.utilities" = list(),
  "pes.party.utilities" = list(),
  "pes.leader.utilities" = list()
)
# vars = paste0(c("cps_party", "cps_leader", "cps_cand"), "_therm")
# Columns carried into every analysis dataset: respondent identifiers and
# covariates plus one "<party>_therm" utility column per party.
vars.to.keep <- c(
  "id", "constituencynumber", "party_id",
  "vote_party_pre", "vote_party_post", "vote_party_both",
  "region", "did.not.vote.or.dont.know.2011",
  "yob", "income", "education", "gender",
  "age_cat", "income_cat", "edu_cat",
  paste0(parties, "_therm"),
  "correctly_predicted_result", "attention_1"
)

# For each imputed dataset build four variants that differ only in how the
# "<party>_therm" utility is defined:
#   tD  - weighted mix of the pre-election party, leader and candidate scores
#   tD1 - pre-election party score only
#   tD2 - post-election party score only
#   tD3 - post-election leader score only
for (k in seq_len(m)) {
  tD <- tD1 <- tD2 <- tD3 <- out$imputations[[k]]
  for (party in parties) {
    therm.col <- paste0(party, "_therm")
    # weights follow the mclogit term order: party, leader, candidate.
    # The third weight must multiply the candidate thermometer; it previously
    # multiplied the leader thermometer a second time.
    tD[[therm.col]] <- weights[1] * tD[[paste0("cps_party_therm_", party)]] +
      weights[2] * tD[[paste0("cps_leader_therm_", party)]] +
      weights[3] * tD[[paste0("cps_cand_therm_", party)]]
    tD1[[therm.col]] <- tD[[paste0("cps_party_therm_", party)]]
    tD2[[therm.col]] <- tD[[paste0("pes_party_therm_", party)]]
    tD3[[therm.col]] <- tD[[paste0("pes_leader_therm_", party)]]
  }
  lists.for.analysis[["weighted.utilities"]][[k]] <- tD[, vars.to.keep]
  lists.for.analysis[["party.utilities"]][[k]] <- tD1[, vars.to.keep]
  lists.for.analysis[["pes.party.utilities"]][[k]] <- tD2[, vars.to.keep]
  lists.for.analysis[["pes.leader.utilities"]][[k]] <- tD3[, vars.to.keep]
}

# now adding tie-breaking variables.  (in cases with a tie for preferred party, we do use this.)
# first, what were they? 

# Non-imputed variant: take the observed pre-election party thermometers
# straight from DD and keep only respondents who rated at least three parties.
raw.therm.cols <- paste0(parties, "_therm")
DD.1 <- DD
for (party in parties) {
  cps.col <- paste0("cps_party_therm_", party)
  DD.1[[paste0(party, "_therm")]] <- DD.1[[cps.col]]
}

# Number of non-missing party thermometers per respondent (row).
nnm.1 <- apply(DD.1[, raw.therm.cols], 1, number.not.missing)
enough.ratings <- nnm.1 >= 3
lists.for.analysis[["raw.party.utilities"]][[1]] <- DD.1[enough.ratings, vars.to.keep]

# Write the assembled lists.for.analysis object to an RData file.
save(lists.for.analysis, file = "lists_for_analysis.RData")


# analysis of correlations -- this goes into the text 

library(tidyverse)
# Correlation matrix of the "cps_" thermometer measures for one dataset.
# The wide "cps_<measure>_therm_<party>" columns are stacked so that each row
# is one respondent-party pair and each column is one measure; the measures
# are then correlated with each other.
get_cor_matrix <- function(data) {
  # One row per respondent x original cps_ column.
  long_scores <- data %>%
    select(id, starts_with("cps_")) %>%
    pivot_longer(cols = starts_with("cps_"), names_to = "var", values_to = "value")

  # Split each column name at "_therm_" into measure and party, then spread
  # the measures back out into columns.
  by_measure <- long_scores %>%
    separate(var, into = c("var", "party"), sep = "_therm_") %>%
    pivot_wider(names_from = var, values_from = "value")

  cor(select(by_measure, starts_with("cps")))
}

# One correlation matrix per imputed dataset. The index is taken from the
# imputations themselves rather than a literal 1:5, so this keeps working if
# the number of imputations is changed.
cms <- tibble(
  imputation = seq_along(out$imputations),
  data = out$imputations
) %>%
  mutate(cor_matrix = purrr::map(data, get_cor_matrix))

# Element-wise mean of the correlation matrices across imputations.
mat <- Reduce(`+`, cms$cor_matrix) / length(cms$cor_matrix)

